import json
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import Dataset
import torch
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
import re

# JSON files that make up the training set. Each file is expected to hold a
# JSON array of records (the loop below uses list.extend on the parsed value);
# the records are assumed to carry 'text', 'CoT' and 'Label' keys -- TODO confirm
# against the actual data files.
json_files = [
    'xxx.json'
]

all_data = []

for file in json_files:
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file, 'r', encoding='utf-8') as f:
        data = json.load(f)
        all_data.extend(data)

df = pd.DataFrame(all_data)

# Drop incomplete records BEFORE casting to str. Casting first turns missing
# values into the literal strings "nan"/"None", which dropna() no longer
# recognises, so incomplete rows would silently end up in the training prompts.
# The index is reset so the frame keeps a plain 0..n-1 index after rows are
# removed.
df = df.dropna(subset=['text', 'CoT', 'Label']).reset_index(drop=True)

# Every remaining field is interpolated into a text prompt, so normalise to str.
df[['text', 'CoT', 'Label']] = df[['text', 'CoT', 'Label']].astype(str)

# Name (or local directory) of the checkpoint whose tokenizer is loaded here.
model_path = "Llama-3.1-8B"

# Right-side padding: pad tokens are appended after the real tokens.
tokenizer = AutoTokenizer.from_pretrained(model_path, padding_side="right", trust_remote_code=True)

# Padding needs a pad token; when the tokenizer does not define one,
# fall back to the end-of-sequence token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    """Build causal-LM training examples from a batch of records.

    Each record is rendered into a single prompt containing the text, the
    chain-of-thought reasoning and the emotion label, then tokenized to a
    fixed length of 2048 tokens (truncated or padded as needed).

    Args:
        examples: Mapping from column name to a sequence of values; the
            'text', 'CoT' and 'Label' columns are read in lockstep.

    Returns:
        The tokenizer output (as PyTorch tensors) with an added ``labels``
        entry: a copy of ``input_ids`` in which every padding position is
        set to -100.
    """
    emotion_definition = (
        "0=neutral: no clear emotional cues | "
        "1=joy: features like positive lexicon, uplifting emojis, achievement expressions | "
        "2=sadness: contains loss/grief elements, negative event descriptions | "
        "3=surprise: unexpected events or cognitive dissonance | "
        "4=anger: aggressive language, confrontational rhetoric | "
        "5=fear: threat-related content, anxiety indicators | "
        "6=disgust: expressions of revulsion, descriptions of unpleasant events"
    )

    # Terminate every sample with an explicit EOS so the model still gets an
    # end-of-sequence target once padding is excluded from the loss below.
    # (Previously that target only existed by accident, because the pad token
    # may be the EOS token and padding was not masked.)
    eos = tokenizer.eos_token or ""

    prompts = [
        f"### Emotion Classification Task:\nAnalyze the emotion of this text step by step\nText content: {text}\nReasoning steps: {cot}\nEmotion Label: {label}(0-6 as defined in {emotion_definition})"
        + eos
        for text, cot, label in zip(examples['text'], examples['CoT'], examples['Label'])
    ]

    tokenized_inputs = tokenizer(
        prompts,
        padding="max_length",
        truncation=True,
        max_length=2048,
        return_tensors="pt"
    )

    # Labels mirror the inputs, except at padding positions: -100 is the index
    # the cross-entropy loss ignores. Without this, the (often large) padded
    # tail of each 2048-token row would count as training targets and dilute
    # the loss on the real tokens. The attention mask is used rather than the
    # pad token id so a genuine EOS token is never masked by mistake.
    labels = tokenized_inputs["input_ids"].clone()
    labels[tokenized_inputs["attention_mask"] == 0] = -100
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Convert the cleaned DataFrame into a Hugging Face Dataset and tokenize it
# in batches of 8 records.
train_dataset = Dataset.from_pandas(df)
train_dataset = train_dataset.map(tokenize_function, batched=True, batch_size=8)

# Load the model from the same `model_path` used for the tokenizer, so the two
# can never drift apart when the checkpoint is changed in one place only.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
    use_cache=False,  # the generation KV cache is not needed while training
)
model.gradient_checkpointing_enable()

# Training configuration, grouped by concern.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./results",
    # Schedule.
    num_train_epochs=3,
    learning_rate=2e-5,
    # Batch of 2 per device, accumulated over 8 steps before each update.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    # Precision / memory.
    bf16=True,
    gradient_checkpointing=True,
    # Data handling.
    remove_unused_columns=True,
    # Logging and checkpoint cadence (in steps).
    logging_steps=10,
    save_steps=500,
)

trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, tokenizer=tokenizer)

# Run fine-tuning, then write the final model to its own directory.
train_result = trainer.train()
trainer.save_model("best_model_llama_8B_emotion")